! python -m spacy download en_core_web_sm
! pip install sentence-transformers umap-learn bertopic
! pip install spacy
! pip install openpyxl
! pip install hdbscan
! pip install datasets
! pip install transformers torch
! pip install --upgrade sentence_transformers umap-learn hdbscan bertopic
Collecting en-core-web-sm==3.7.1
Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 2.6 MB/s eta 0:00:0000:0100:01
Requirement already satisfied: spacy<3.8.0,>=3.7.2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from en-core-web-sm==3.7.1) (3.7.4)
Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.0.8)
Requirement already satisfied: numpy>=1.19.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.26.4)
Requirement already satisfied: thinc<8.3.0,>=8.2.2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (8.2.3)
Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.0.9)
Requirement already satisfied: setuptools in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (65.6.3)
Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.0.10)
Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.0.5)
Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.4.8)
Requirement already satisfied: weasel<0.4.0,>=0.1.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.3.4)
Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.3.0)
Requirement already satisfied: packaging>=20.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (22.0)
Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.1.2)
Requirement already satisfied: typer<0.10.0,>=0.3.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.9.4)
Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (5.2.1)
Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.0.10)
Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.7.0)
Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.0.12)
Requirement already satisfied: jinja2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.1.2)
Requirement already satisfied: requests<3.0.0,>=2.13.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.28.1)
Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (4.64.1)
Requirement already satisfied: pydantic-core==2.18.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.18.1)
Requirement already satisfied: typing-extensions>=4.6.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (4.9.0)
Requirement already satisfied: annotated-types>=0.4.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.6.0)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.26.14)
Requirement already satisfied: certifi>=2017.4.17 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2023.5.7)
Requirement already satisfied: idna<4,>=2.5 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.4)
Requirement already satisfied: charset-normalizer<3,>=2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.0.4)
Requirement already satisfied: blis<0.8.0,>=0.7.8 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from thinc<8.3.0,>=8.2.2->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.7.11)
Requirement already satisfied: confection<1.0.0,>=0.0.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from thinc<8.3.0,>=8.2.2->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.1.4)
Requirement already satisfied: click<9.0.0,>=7.1.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from typer<0.10.0,>=0.3.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (8.1.7)
Requirement already satisfied: cloudpathlib<0.17.0,>=0.7.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from weasel<0.4.0,>=0.1.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.16.0)
Requirement already satisfied: MarkupSafe>=2.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from jinja2->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.1.1)
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
Requirement already satisfied: sentence-transformers in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (2.7.0)
Requirement already satisfied: umap-learn in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (0.5.6)
Requirement already satisfied: bertopic in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (0.16.2)
Requirement already satisfied: Pillow in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from sentence-transformers) (9.4.0)
Requirement already satisfied: scipy in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from sentence-transformers) (1.10.0)
Requirement already satisfied: numpy in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from sentence-transformers) (1.26.4)
Requirement already satisfied: tqdm in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from sentence-transformers) (4.64.1)
Requirement already satisfied: huggingface-hub>=0.15.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from sentence-transformers) (0.22.2)
Requirement already satisfied: transformers<5.0.0,>=4.34.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from sentence-transformers) (4.39.3)
Requirement already satisfied: torch>=1.11.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from sentence-transformers) (1.12.1)
Requirement already satisfied: scikit-learn in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from sentence-transformers) (1.4.2)
Requirement already satisfied: pynndescent>=0.5 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from umap-learn) (0.5.12)
Requirement already satisfied: numba>=0.51.2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from umap-learn) (0.59.1)
Requirement already satisfied: plotly>=4.7.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from bertopic) (5.9.0)
Requirement already satisfied: pandas>=1.1.5 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from bertopic) (2.0.1)
Requirement already satisfied: hdbscan>=0.8.29 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from bertopic) (0.8.33)
Requirement already satisfied: cython<3,>=0.27 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from hdbscan>=0.8.29->bertopic) (0.29.37)
Requirement already satisfied: joblib>=1.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from hdbscan>=0.8.29->bertopic) (1.2.0)
Requirement already satisfied: pyyaml>=5.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence-transformers) (6.0)
Requirement already satisfied: requests in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence-transformers) (2.28.1)
Requirement already satisfied: typing-extensions>=3.7.4.3 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence-transformers) (4.9.0)
Requirement already satisfied: filelock in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence-transformers) (3.9.0)
Requirement already satisfied: fsspec>=2023.5.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence-transformers) (2024.3.1)
Requirement already satisfied: packaging>=20.9 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence-transformers) (22.0)
Requirement already satisfied: llvmlite<0.43,>=0.42.0dev0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from numba>=0.51.2->umap-learn) (0.42.0)
Requirement already satisfied: python-dateutil>=2.8.2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from pandas>=1.1.5->bertopic) (2.8.2)
Requirement already satisfied: tzdata>=2022.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from pandas>=1.1.5->bertopic) (2023.3)
Requirement already satisfied: pytz>=2020.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from pandas>=1.1.5->bertopic) (2022.7)
Requirement already satisfied: tenacity>=6.2.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from plotly>=4.7.0->bertopic) (8.0.1)
Requirement already satisfied: threadpoolctl>=2.0.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from scikit-learn->sentence-transformers) (2.2.0)
Requirement already satisfied: tokenizers<0.19,>=0.14 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from transformers<5.0.0,>=4.34.0->sentence-transformers) (0.15.2)
Requirement already satisfied: safetensors>=0.4.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from transformers<5.0.0,>=4.34.0->sentence-transformers) (0.4.3)
Requirement already satisfied: regex!=2019.12.17 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from transformers<5.0.0,>=4.34.0->sentence-transformers) (2022.7.9)
Requirement already satisfied: six>=1.5 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from python-dateutil>=2.8.2->pandas>=1.1.5->bertopic) (1.16.0)
Requirement already satisfied: certifi>=2017.4.17 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests->huggingface-hub>=0.15.1->sentence-transformers) (2023.5.7)
Requirement already satisfied: idna<4,>=2.5 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests->huggingface-hub>=0.15.1->sentence-transformers) (3.4)
Requirement already satisfied: charset-normalizer<3,>=2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests->huggingface-hub>=0.15.1->sentence-transformers) (2.0.4)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests->huggingface-hub>=0.15.1->sentence-transformers) (1.26.14)
Requirement already satisfied: spacy in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (3.7.4)
Requirement already satisfied: jinja2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy) (3.1.2)
Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy) (5.2.1)
Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy) (3.3.0)
Requirement already satisfied: setuptools in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy) (65.6.3)
Requirement already satisfied: weasel<0.4.0,>=0.1.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy) (0.3.4)
Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy) (2.0.8)
Requirement already satisfied: thinc<8.3.0,>=8.2.2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy) (8.2.3)
Requirement already satisfied: numpy>=1.19.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy) (1.26.4)
Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy) (3.0.12)
Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy) (2.4.8)
Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy) (4.64.1)
Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy) (2.7.0)
Requirement already satisfied: typer<0.10.0,>=0.3.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy) (0.9.4)
Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy) (1.1.2)
Requirement already satisfied: packaging>=20.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy) (22.0)
Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy) (2.0.10)
Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy) (1.0.10)
Requirement already satisfied: requests<3.0.0,>=2.13.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy) (2.28.1)
Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy) (3.0.9)
Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from spacy) (1.0.5)
Requirement already satisfied: pydantic-core==2.18.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy) (2.18.1)
Requirement already satisfied: typing-extensions>=4.6.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy) (4.9.0)
Requirement already satisfied: annotated-types>=0.4.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy) (0.6.0)
Requirement already satisfied: charset-normalizer<3,>=2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2.0.4)
Requirement already satisfied: certifi>=2017.4.17 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2023.5.7)
Requirement already satisfied: idna<4,>=2.5 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (3.4)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy) (1.26.14)
Requirement already satisfied: blis<0.8.0,>=0.7.8 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from thinc<8.3.0,>=8.2.2->spacy) (0.7.11)
Requirement already satisfied: confection<1.0.0,>=0.0.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from thinc<8.3.0,>=8.2.2->spacy) (0.1.4)
Requirement already satisfied: click<9.0.0,>=7.1.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from typer<0.10.0,>=0.3.0->spacy) (8.1.7)
Requirement already satisfied: cloudpathlib<0.17.0,>=0.7.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from weasel<0.4.0,>=0.1.0->spacy) (0.16.0)
Requirement already satisfied: MarkupSafe>=2.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from jinja2->spacy) (2.1.1)
Requirement already satisfied: openpyxl in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (3.0.10)
Requirement already satisfied: et_xmlfile in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from openpyxl) (1.1.0)
Requirement already satisfied: hdbscan in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (0.8.33)
Requirement already satisfied: scikit-learn>=0.20 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from hdbscan) (1.4.2)
Requirement already satisfied: numpy>=1.20 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from hdbscan) (1.26.4)
Requirement already satisfied: cython<3,>=0.27 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from hdbscan) (0.29.37)
Requirement already satisfied: scipy>=1.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from hdbscan) (1.10.0)
Requirement already satisfied: joblib>=1.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from hdbscan) (1.2.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from scikit-learn>=0.20->hdbscan) (2.2.0)
Requirement already satisfied: datasets in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (2.19.1)
Requirement already satisfied: numpy>=1.17 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from datasets) (1.26.4)
Requirement already satisfied: requests>=2.19.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from datasets) (2.28.1)
Requirement already satisfied: multiprocess in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from datasets) (0.70.16)
Requirement already satisfied: pyyaml>=5.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from datasets) (6.0)
Requirement already satisfied: pyarrow>=12.0.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from datasets) (16.0.0)
Requirement already satisfied: filelock in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from datasets) (3.9.0)
Requirement already satisfied: xxhash in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from datasets) (3.4.1)
Requirement already satisfied: huggingface-hub>=0.21.2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from datasets) (0.22.2)
Requirement already satisfied: pandas in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from datasets) (2.0.1)
Requirement already satisfied: dill<0.3.9,>=0.3.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from datasets) (0.3.8)
Requirement already satisfied: packaging in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from datasets) (22.0)
Requirement already satisfied: fsspec[http]<=2024.3.1,>=2023.1.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from datasets) (2024.3.1)
Requirement already satisfied: tqdm>=4.62.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from datasets) (4.64.1)
Requirement already satisfied: pyarrow-hotfix in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from datasets) (0.6)
Requirement already satisfied: aiohttp in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from datasets) (3.9.5)
Requirement already satisfied: yarl<2.0,>=1.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from aiohttp->datasets) (1.9.4)
Requirement already satisfied: async-timeout<5.0,>=4.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from aiohttp->datasets) (4.0.3)
Requirement already satisfied: multidict<7.0,>=4.5 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from aiohttp->datasets) (6.0.5)
Requirement already satisfied: aiosignal>=1.1.2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from aiohttp->datasets) (1.3.1)
Requirement already satisfied: frozenlist>=1.1.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from aiohttp->datasets) (1.4.1)
Requirement already satisfied: attrs>=17.3.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from aiohttp->datasets) (23.2.0)
Requirement already satisfied: typing-extensions>=3.7.4.3 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from huggingface-hub>=0.21.2->datasets) (4.9.0)
Requirement already satisfied: charset-normalizer<3,>=2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests>=2.19.0->datasets) (2.0.4)
Requirement already satisfied: certifi>=2017.4.17 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests>=2.19.0->datasets) (2023.5.7)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests>=2.19.0->datasets) (1.26.14)
Requirement already satisfied: idna<4,>=2.5 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests>=2.19.0->datasets) (3.4)
Requirement already satisfied: tzdata>=2022.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from pandas->datasets) (2023.3)
Requirement already satisfied: pytz>=2020.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from pandas->datasets) (2022.7)
Requirement already satisfied: python-dateutil>=2.8.2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from pandas->datasets) (2.8.2)
Requirement already satisfied: six>=1.5 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)
Requirement already satisfied: transformers in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (4.39.3)
Requirement already satisfied: torch in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (1.12.1)
Requirement already satisfied: tqdm>=4.27 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from transformers) (4.64.1)
Requirement already satisfied: tokenizers<0.19,>=0.14 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from transformers) (0.15.2)
Requirement already satisfied: huggingface-hub<1.0,>=0.19.3 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from transformers) (0.22.2)
Requirement already satisfied: pyyaml>=5.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from transformers) (6.0)
Requirement already satisfied: numpy>=1.17 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from transformers) (1.26.4)
Requirement already satisfied: requests in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from transformers) (2.28.1)
Requirement already satisfied: safetensors>=0.4.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from transformers) (0.4.3)
Requirement already satisfied: regex!=2019.12.17 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from transformers) (2022.7.9)
Requirement already satisfied: filelock in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from transformers) (3.9.0)
Requirement already satisfied: packaging>=20.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from transformers) (22.0)
Requirement already satisfied: typing_extensions in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from torch) (4.9.0)
Requirement already satisfied: fsspec>=2023.5.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.19.3->transformers) (2024.3.1)
Requirement already satisfied: certifi>=2017.4.17 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests->transformers) (2023.5.7)
Requirement already satisfied: idna<4,>=2.5 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests->transformers) (3.4)
Requirement already satisfied: charset-normalizer<3,>=2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests->transformers) (2.0.4)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests->transformers) (1.26.14)
Requirement already satisfied: sentence_transformers in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (2.7.0)
Requirement already satisfied: umap-learn in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (0.5.6)
Requirement already satisfied: hdbscan in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (0.8.33)
Requirement already satisfied: bertopic in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (0.16.2)
Requirement already satisfied: Pillow in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from sentence_transformers) (9.4.0)
Requirement already satisfied: torch>=1.11.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from sentence_transformers) (1.12.1)
Requirement already satisfied: numpy in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from sentence_transformers) (1.26.4)
Requirement already satisfied: scikit-learn in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from sentence_transformers) (1.4.2)
Requirement already satisfied: transformers<5.0.0,>=4.34.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from sentence_transformers) (4.39.3)
Requirement already satisfied: huggingface-hub>=0.15.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from sentence_transformers) (0.22.2)
Requirement already satisfied: scipy in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from sentence_transformers) (1.10.0)
Requirement already satisfied: tqdm in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from sentence_transformers) (4.64.1)
Requirement already satisfied: pynndescent>=0.5 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from umap-learn) (0.5.12)
Requirement already satisfied: numba>=0.51.2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from umap-learn) (0.59.1)
Requirement already satisfied: cython<3,>=0.27 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from hdbscan) (0.29.37)
Requirement already satisfied: joblib>=1.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from hdbscan) (1.2.0)
Requirement already satisfied: plotly>=4.7.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from bertopic) (5.9.0)
Requirement already satisfied: pandas>=1.1.5 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from bertopic) (2.0.1)
Requirement already satisfied: filelock in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (3.9.0)
Requirement already satisfied: typing-extensions>=3.7.4.3 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (4.9.0)
Requirement already satisfied: packaging>=20.9 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (22.0)
Requirement already satisfied: pyyaml>=5.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (6.0)
Requirement already satisfied: fsspec>=2023.5.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (2024.3.1)
Requirement already satisfied: requests in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (2.28.1)
Requirement already satisfied: llvmlite<0.43,>=0.42.0dev0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from numba>=0.51.2->umap-learn) (0.42.0)
Requirement already satisfied: pytz>=2020.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from pandas>=1.1.5->bertopic) (2022.7)
Requirement already satisfied: python-dateutil>=2.8.2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from pandas>=1.1.5->bertopic) (2.8.2)
Requirement already satisfied: tzdata>=2022.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from pandas>=1.1.5->bertopic) (2023.3)
Requirement already satisfied: tenacity>=6.2.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from plotly>=4.7.0->bertopic) (8.0.1)
Requirement already satisfied: threadpoolctl>=2.0.0 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from scikit-learn->sentence_transformers) (2.2.0)
Requirement already satisfied: regex!=2019.12.17 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from transformers<5.0.0,>=4.34.0->sentence_transformers) (2022.7.9)
Requirement already satisfied: tokenizers<0.19,>=0.14 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from transformers<5.0.0,>=4.34.0->sentence_transformers) (0.15.2)
Requirement already satisfied: safetensors>=0.4.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from transformers<5.0.0,>=4.34.0->sentence_transformers) (0.4.3)
Requirement already satisfied: six>=1.5 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from python-dateutil>=2.8.2->pandas>=1.1.5->bertopic) (1.16.0)
Requirement already satisfied: certifi>=2017.4.17 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests->huggingface-hub>=0.15.1->sentence_transformers) (2023.5.7)
Requirement already satisfied: idna<4,>=2.5 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests->huggingface-hub>=0.15.1->sentence_transformers) (3.4)
Requirement already satisfied: charset-normalizer<3,>=2 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests->huggingface-hub>=0.15.1->sentence_transformers) (2.0.4)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/helgegeurtjacobusmoes/anaconda3/lib/python3.10/site-packages (from requests->huggingface-hub>=0.15.1->sentence_transformers) (1.26.14)
import os
import pandas as pd
import openpyxl
from datetime import datetime
import matplotlib.pyplot as plt
from textblob import TextBlob
import ast
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re
from nltk.tokenize import word_tokenize
from collections import Counter
from bertopic import BERTopic
import random
import numpy as np
import torch
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import matplotlib.cm as cm
from collections import Counter
from umap import UMAP
from wordcloud import WordCloud
import seaborn as sns
from scipy.stats import f_oneway
from sklearn.preprocessing import LabelEncoder
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from gensim import corpora, models
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from scipy.cluster.hierarchy import linkage, fcluster
# Fetch the NLTK resources that the preprocessing steps below rely on
for nltk_resource in (
    'punkt',      # tokenizer model
    'stopwords',  # stopwords list
    'wordnet',    # lexical database for lemmatization
    'omw-1.4',    # Open Multilingual Wordnet, needed for lemmatization in multiple languages
):
    nltk.download(nltk_resource)
[nltk_data] Downloading package punkt to [nltk_data] /Users/helgegeurtjacobusmoes/nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package stopwords to [nltk_data] /Users/helgegeurtjacobusmoes/nltk_data... [nltk_data] Package stopwords is already up-to-date! [nltk_data] Downloading package wordnet to [nltk_data] /Users/helgegeurtjacobusmoes/nltk_data... [nltk_data] Package wordnet is already up-to-date! [nltk_data] Downloading package omw-1.4 to [nltk_data] /Users/helgegeurtjacobusmoes/nltk_data... [nltk_data] Package omw-1.4 is already up-to-date!
True
# Location of the merged news-article workbook
excel_path = '/Users/helgegeurtjacobusmoes/Desktop/thesis data/Updated_Merged_Data.xlsx'
# Read the workbook into a DataFrame and display it
updated_merged_data = pd.read_excel(io=excel_path)
updated_merged_data
| Headline | Publication | URL | News Outlet | Type of News | Word Count | Body | Publication Date | |
|---|---|---|---|---|---|---|---|---|
| 0 | Nee, kunstmatige intelligentie gaat ons niet u... | Trouw, Verdieping; Blz. 4, 5, 2044 words | https://advance.lexis.com/api/document?collect... | Trouw | Verdieping | 2044 | Welkom in de AI-fabriek serie\nDat kunstmatige... | 7 december 2023 donderdag |
| 1 | Wereldleiders zoeken grip op kunstmatige intel... | Trouw, Vandaag; Blz. 6, 528 words | https://advance.lexis.com/api/document?collect... | Trouw | Vandaag | 528 | Op het Britse landgoed Bletchley Park werden t... | 3 november 2023 vrijdag |
| 2 | Kunstmatige intelligentie is best bedreigend | Trouw, Tijdgeest; Blz. 8, 576 words | https://advance.lexis.com/api/document?collect... | Trouw | Tijdgeest | 576 | Of kunstmatige intelligentie nuttig is (Tijdge... | 13 mei 2023 zaterdag |
| 3 | Mensen zijn een stuk efficiënter dan kunstmati... | Trouw, Vandaag; Blz. 3, 741 words | https://advance.lexis.com/api/document?collect... | Trouw | Vandaag | 741 | De wereld raakte het afgelopen jaar in de ban ... | 21 oktober 2023 zaterdag |
| 4 | Bedreigt kunstmatige intelligentie ons godsbeeld? | Trouw, Religie en Filosofie; Blz. 8, 9, 1367 w... | https://advance.lexis.com/api/document?collect... | Trouw | Religie en Filosofie | 1367 | Theologisch elftal\n'In het begin was het Woor... | 16 december 2022 vrijdag |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6441 | De rauwe realiteit | Het Financieele Dagblad, MORGEN; Blz. 4, 2920 ... | https://advance.lexis.com/api/document?collect... | Het Financieele Dagblad | MORGEN | 2920 | Grootse oplossingen\nDrie stedelijke 'ontwrich... | 14 oktober 2017 zaterdag 12:00 AM GMT |
| 6442 | No Headline In Original | Het Financieele Dagblad, PAGINA 13; Blz. 13, 1... | https://advance.lexis.com/api/document?collect... | Het Financieele Dagblad | PAGINA | 114 | klinkt als muziek\nDe Walkman, van Sony, is vo... | 29 april 2023 zaterdag 12:00 AM GMT |
| 6443 | Groeten uit het hart van de hightech | Het Financieele Dagblad, WEEKEND; Blz. 6, 2799... | https://advance.lexis.com/api/document?collect... | Het Financieele Dagblad | WEEKEND | 2799 | Het is zover voor 'onze man in San Francisco'.... | 20 augustus 2016 zaterdag 12:00 AM GMT |
| 6444 | De complete lijst Jonge Talenten 2019 | Het Financieele Dagblad, FD PERSOONLIJK; Arbei... | https://advance.lexis.com/api/document?collect... | Het Financieele Dagblad | FD PERSOONLIJK; Arbeidsmarkt | 8007 | Rebel werkte zes jaar bij zakenbank Morgan Sta... | 17 januari 2019 donderdag 1:00 PM GMT |
| 6445 | No Headline In Original | Het Financieele Dagblad, DE WERELD; Blz. 30, 9... | https://advance.lexis.com/api/document?collect... | Het Financieele Dagblad | DE WERELD | 969 | The Conversation (Londen)Gates Notes (VS)The E... | 8 december 2018 zaterdag 12:00 AM GMT |
6446 rows × 8 columns
# Load Dutch stopwords
stop_words_nl = set(stopwords.words('dutch'))

# Newspaper names that are filtered out in addition to the Dutch stopwords.
# Every entry is a single lowercase token, because the filter compares against
# individual lowercased tokens.
_OUTLET_STOP_WORDS = {'trouw', 'volkskrant', 'financieele', 'algemeen', 'dagblad', 'nrc', 'telegraaf'}

# Placeholder used for articles without a headline. It is a multi-word phrase,
# so it cannot be removed by the per-token stopword filter; it is stripped from
# the lowercased text before tokenization instead.
_MISSING_HEADLINE_PLACEHOLDER = 'no headline in original'

# Built once at definition time instead of on every call to preprocess_text.
_PREPROCESS_STOP_WORDS = stop_words_nl | _OUTLET_STOP_WORDS
_LEMMATIZER = WordNetLemmatizer()


def preprocess_text(text):
    """Normalize one document for topic modelling.

    Steps, in order: lowercase, strip the missing-headline placeholder,
    tokenize, delete digits inside tokens, keep only alphanumeric tokens that
    are not stopwords, lemmatize, and join the tokens with single spaces.

    Args:
        text: The raw document as a ``str``.

    Returns:
        The cleaned document as a single space-separated string.
    """
    # Lowercase first so the placeholder and the stopwords match regardless of case
    text = text.lower().replace(_MISSING_HEADLINE_PLACEHOLDER, ' ')
    # NOTE(review): word_tokenize is called with its default language; confirm
    # whether language='dutch' should be passed for this corpus.
    words = word_tokenize(text)
    # Remove digits; tokens that consisted only of digits become '' and are
    # dropped by the isalnum() filter below
    words = [re.sub(r'\d+', '', word) for word in words]
    # Drop punctuation/special-character tokens and stopwords
    words = [
        word for word in words
        if word.isalnum() and word not in _PREPROCESS_STOP_WORDS
    ]
    # NOTE(review): WordNetLemmatizer is backed by the English WordNet; on this
    # Dutch text it appears to truncate some words (e.g. "zes" -> "z" in the
    # saved output). Consider a Dutch lemmatizer; kept here so existing results
    # stay comparable.
    words = [_LEMMATIZER.lemmatize(word) for word in words]
    return ' '.join(words)
# Build the text to analyse: headline followed by body, missing parts treated as empty strings
headline_text = updated_merged_data["Headline"].fillna("")
body_text = updated_merged_data["Body"].fillna("")
updated_merged_data["Combined"] = (headline_text + " " + body_text).apply(preprocess_text)
# Drop any row whose Combined value is missing, then renumber the index from 0
updated_merged_data.dropna(subset=["Combined"], inplace=True)
updated_merged_data.reset_index(drop=True, inplace=True)
updated_merged_data
| Headline | Publication | URL | News Outlet | Type of News | Word Count | Body | Publication Date | Combined | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Nee, kunstmatige intelligentie gaat ons niet u... | Trouw, Verdieping; Blz. 4, 5, 2044 words | https://advance.lexis.com/api/document?collect... | Trouw | Verdieping | 2044 | Welkom in de AI-fabriek serie\nDat kunstmatige... | 7 december 2023 donderdag | nee kunstmatige intelligentie gaat uitroeien w... |
| 1 | Wereldleiders zoeken grip op kunstmatige intel... | Trouw, Vandaag; Blz. 6, 528 words | https://advance.lexis.com/api/document?collect... | Trouw | Vandaag | 528 | Op het Britse landgoed Bletchley Park werden t... | 3 november 2023 vrijdag | wereldleiders zoeken grip kunstmatige intellig... |
| 2 | Kunstmatige intelligentie is best bedreigend | Trouw, Tijdgeest; Blz. 8, 576 words | https://advance.lexis.com/api/document?collect... | Trouw | Tijdgeest | 576 | Of kunstmatige intelligentie nuttig is (Tijdge... | 13 mei 2023 zaterdag | kunstmatige intelligentie best bedreigend kuns... |
| 3 | Mensen zijn een stuk efficiënter dan kunstmati... | Trouw, Vandaag; Blz. 3, 741 words | https://advance.lexis.com/api/document?collect... | Trouw | Vandaag | 741 | De wereld raakte het afgelopen jaar in de ban ... | 21 oktober 2023 zaterdag | mensen stuk efficiënter kunstmatige intelligen... |
| 4 | Bedreigt kunstmatige intelligentie ons godsbeeld? | Trouw, Religie en Filosofie; Blz. 8, 9, 1367 w... | https://advance.lexis.com/api/document?collect... | Trouw | Religie en Filosofie | 1367 | Theologisch elftal\n'In het begin was het Woor... | 16 december 2022 vrijdag | bedreigt kunstmatige intelligentie godsbeeld t... |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6441 | De rauwe realiteit | Het Financieele Dagblad, MORGEN; Blz. 4, 2920 ... | https://advance.lexis.com/api/document?collect... | Het Financieele Dagblad | MORGEN | 2920 | Grootse oplossingen\nDrie stedelijke 'ontwrich... | 14 oktober 2017 zaterdag 12:00 AM GMT | rauwe realiteit grootse oplossingen drie stede... |
| 6442 | No Headline In Original | Het Financieele Dagblad, PAGINA 13; Blz. 13, 1... | https://advance.lexis.com/api/document?collect... | Het Financieele Dagblad | PAGINA | 114 | klinkt als muziek\nDe Walkman, van Sony, is vo... | 29 april 2023 zaterdag 12:00 AM GMT | no headline original klinkt muziek walkman son... |
| 6443 | Groeten uit het hart van de hightech | Het Financieele Dagblad, WEEKEND; Blz. 6, 2799... | https://advance.lexis.com/api/document?collect... | Het Financieele Dagblad | WEEKEND | 2799 | Het is zover voor 'onze man in San Francisco'.... | 20 augustus 2016 zaterdag 12:00 AM GMT | groeten hart hightech zover man san francisco ... |
| 6444 | De complete lijst Jonge Talenten 2019 | Het Financieele Dagblad, FD PERSOONLIJK; Arbei... | https://advance.lexis.com/api/document?collect... | Het Financieele Dagblad | FD PERSOONLIJK; Arbeidsmarkt | 8007 | Rebel werkte zes jaar bij zakenbank Morgan Sta... | 17 januari 2019 donderdag 1:00 PM GMT | complete lijst jonge talenten rebel werkte z j... |
| 6445 | No Headline In Original | Het Financieele Dagblad, DE WERELD; Blz. 30, 9... | https://advance.lexis.com/api/document?collect... | Het Financieele Dagblad | DE WERELD | 969 | The Conversation (Londen)Gates Notes (VS)The E... | 8 december 2018 zaterdag 12:00 AM GMT | no headline original the conversation londen g... |
6446 rows × 9 columns
# Dictionary for Dutch to English month translation
dutch_months = {
    "januari": "January", "februari": "February", "maart": "March",
    "april": "April", "mei": "May", "juni": "June",
    "juli": "July", "augustus": "August", "september": "September",
    "oktober": "October", "november": "November", "december": "December"
}

# Month number (1-12) for each Dutch month name, derived from the insertion
# order of dutch_months (januari first). Parsing by number avoids "%B", whose
# accepted month names depend on the active locale.
_DUTCH_MONTH_NUMBERS = {
    month_name: month_number
    for month_number, month_name in enumerate(dutch_months, start=1)
}


def translate_date(date_str):
    """Convert a Dutch date string to the form ``DD-MM-YYYY``.

    The input is expected to start with ``<day> <Dutch month name> <year>``,
    e.g. ``"7 december 2023 donderdag"``; anything after the year (weekday,
    time, timezone) is ignored.

    Args:
        date_str: The raw date value; may be NaN/None.

    Returns:
        The date formatted as ``"%d-%m-%Y"``, or ``None`` when the value is
        missing, has fewer than three whitespace-separated parts, names an
        unknown month, or does not form a valid calendar date.
    """
    if pd.isna(date_str):
        return None  # Missing value: nothing to parse
    try:
        parts = date_str.split()
        if len(parts) < 3:
            return None  # Not enough parts for day, month and year
        month_number = _DUTCH_MONTH_NUMBERS.get(parts[1].lower())
        if month_number is None:
            return None  # Month name not recognised
        day = parts[0].zfill(2)  # Ensure day is two digits
        year = parts[2]
        # Purely numeric directives, so the result does not depend on the locale
        date_obj = datetime.strptime(f"{day}-{month_number:02d}-{year}", "%d-%m-%Y")
        return date_obj.strftime("%d-%m-%Y")
    except (AttributeError, ValueError) as e:
        # AttributeError: value is not a string; ValueError: not a valid date
        print(f"Error parsing date '{date_str}': {e}")
        return None
# Convert every 'Publication Date' entry with translate_date and store the result in place
raw_publication_dates = updated_merged_data['Publication Date']
updated_merged_data['Publication Date'] = raw_publication_dates.apply(translate_date)
# Display the DataFrame to verify the cleaned dates
updated_merged_data
| Headline | Publication | URL | News Outlet | Type of News | Word Count | Body | Publication Date | Combined | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Nee, kunstmatige intelligentie gaat ons niet u... | Trouw, Verdieping; Blz. 4, 5, 2044 words | https://advance.lexis.com/api/document?collect... | Trouw | Verdieping | 2044 | Welkom in de AI-fabriek serie\nDat kunstmatige... | 07-12-2023 | nee kunstmatige intelligentie gaat uitroeien w... |
| 1 | Wereldleiders zoeken grip op kunstmatige intel... | Trouw, Vandaag; Blz. 6, 528 words | https://advance.lexis.com/api/document?collect... | Trouw | Vandaag | 528 | Op het Britse landgoed Bletchley Park werden t... | 03-11-2023 | wereldleiders zoeken grip kunstmatige intellig... |
| 2 | Kunstmatige intelligentie is best bedreigend | Trouw, Tijdgeest; Blz. 8, 576 words | https://advance.lexis.com/api/document?collect... | Trouw | Tijdgeest | 576 | Of kunstmatige intelligentie nuttig is (Tijdge... | 13-05-2023 | kunstmatige intelligentie best bedreigend kuns... |
| 3 | Mensen zijn een stuk efficiënter dan kunstmati... | Trouw, Vandaag; Blz. 3, 741 words | https://advance.lexis.com/api/document?collect... | Trouw | Vandaag | 741 | De wereld raakte het afgelopen jaar in de ban ... | 21-10-2023 | mensen stuk efficiënter kunstmatige intelligen... |
| 4 | Bedreigt kunstmatige intelligentie ons godsbeeld? | Trouw, Religie en Filosofie; Blz. 8, 9, 1367 w... | https://advance.lexis.com/api/document?collect... | Trouw | Religie en Filosofie | 1367 | Theologisch elftal\n'In het begin was het Woor... | 16-12-2022 | bedreigt kunstmatige intelligentie godsbeeld t... |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6441 | De rauwe realiteit | Het Financieele Dagblad, MORGEN; Blz. 4, 2920 ... | https://advance.lexis.com/api/document?collect... | Het Financieele Dagblad | MORGEN | 2920 | Grootse oplossingen\nDrie stedelijke 'ontwrich... | 14-10-2017 | rauwe realiteit grootse oplossingen drie stede... |
| 6442 | No Headline In Original | Het Financieele Dagblad, PAGINA 13; Blz. 13, 1... | https://advance.lexis.com/api/document?collect... | Het Financieele Dagblad | PAGINA | 114 | klinkt als muziek\nDe Walkman, van Sony, is vo... | 29-04-2023 | no headline original klinkt muziek walkman son... |
| 6443 | Groeten uit het hart van de hightech | Het Financieele Dagblad, WEEKEND; Blz. 6, 2799... | https://advance.lexis.com/api/document?collect... | Het Financieele Dagblad | WEEKEND | 2799 | Het is zover voor 'onze man in San Francisco'.... | 20-08-2016 | groeten hart hightech zover man san francisco ... |
| 6444 | De complete lijst Jonge Talenten 2019 | Het Financieele Dagblad, FD PERSOONLIJK; Arbei... | https://advance.lexis.com/api/document?collect... | Het Financieele Dagblad | FD PERSOONLIJK; Arbeidsmarkt | 8007 | Rebel werkte zes jaar bij zakenbank Morgan Sta... | 17-01-2019 | complete lijst jonge talenten rebel werkte z j... |
| 6445 | No Headline In Original | Het Financieele Dagblad, DE WERELD; Blz. 30, 9... | https://advance.lexis.com/api/document?collect... | Het Financieele Dagblad | DE WERELD | 969 | The Conversation (Londen)Gates Notes (VS)The E... | 08-12-2018 | no headline original the conversation londen g... |
6446 rows × 9 columns
# Persist the cleaned DataFrame twice, as an Excel workbook and as a CSV file, without the row index
cleaned_xlsx_path = "/Users/helgegeurtjacobusmoes/Desktop/thesis data/Cleaned_Updated_Merged_Data.xlsx"
cleaned_csv_path = "/Users/helgegeurtjacobusmoes/Desktop/thesis data/Cleaned_Updated_Merged_Data.csv"
updated_merged_data.to_excel(cleaned_xlsx_path, index=False)
updated_merged_data.to_csv(cleaned_csv_path, index=False)
import pandas as pd
import re
# Read the cleaned workbook back from disk
file_path = "/Users/helgegeurtjacobusmoes/Desktop/thesis data/Cleaned_Updated_Merged_Data.xlsx"
data = pd.read_excel(file_path)
# Cast 'Combined' to str so the .str accessor can be used on every row
data['Combined'] = data['Combined'].astype(str)
# Add one '<term> Count' column per search term holding the per-row number of
# case-insensitive whole-word matches
for search_term in ('AI', 'Kunstmatige Intelligentie', 'Artificial Intelligence'):
    data[f'{search_term} Count'] = data['Combined'].str.count(
        rf'\b{search_term}\b', flags=re.IGNORECASE
    )
# Totals over all rows
total_ai_count = data['AI Count'].sum()
total_ki_count = data['Kunstmatige Intelligentie Count'].sum()
total_ai_full_count = data['Artificial Intelligence Count'].sum()
print(f"Total 'AI' count: {total_ai_count}")
print(f"Total 'Kunstmatige Intelligentie' count: {total_ki_count}")
print(f"Total 'Artificial Intelligence' count: {total_ai_full_count}")
Total 'AI' count: 7889 Total 'Kunstmatige Intelligentie' count: 10442 Total 'Artificial Intelligence' count: 276
This code outlines the procedure for topic modeling using advanced natural language processing (NLP) techniques, specifically utilizing the BERTopic model which incorporates embeddings, dimensionality reduction (UMAP), and clustering (HDBSCAN) to automatically categorize Dutch text data into topics:
Setting Environment Variable:
os.environ["OMP_MAX_ACTIVE_LEVELS"] = "2"
Loading a Pre-trained Embedding Model:
embedding_model = SentenceTransformer('all-MiniLM-L6-v2', language="Dutch")
Creating UMAP and HDBSCAN Models:
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom')
Initializing and Fitting the BERTopic Model:
model = BERTopic(embedding_model=embedding_model, umap_model=umap_model, hdbscan_model=hdbscan_model, nr_topics=12)
topics, probabilities = model.fit_transform(sentences)
Retrieving and Displaying Topic Frequencies:
topic_freq = model.get_topic_freq()
Generating Topic Details and Summarization:
Displaying the Result:
print(topic_details_df)
This code effectively utilizes advanced machine learning techniques for unsupervised learning to discover and summarize topics within a Dutch text dataset, providing a comprehensive toolset for analyzing text data without predefined categories or topics.
### Load your data from an Excel file
data = pd.read_excel('/Users/helgegeurtjacobusmoes/Desktop/thesis data/Cleaned_Updated_Merged_Data.xlsx')
# Missing 'Combined' cells become empty strings so every document is a str
content_titles = data['Combined'].fillna("").tolist()

### Load a pre-trained Sentence Transformer model
# The SentenceTransformer constructor takes no `language` keyword, so the model
# is selected by name only.
# NOTE(review): 'all-MiniLM-L6-v2' is presumably trained mainly on English text;
# confirm whether a multilingual model would suit this Dutch corpus better.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

### BERTopic model (N = 90): to run this cell, switch it from Markdown to Code
### N = 90
### Set the maximum number of nested active parallel regions
os.environ["OMP_MAX_ACTIVE_LEVELS"] = "2"

### Create an instance of the UMAP and HDBSCAN model with specific parameters
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom')

### Initialize the BERTopic model with the UMAP, HDBSCAN and embedding models
# Pass the SentenceTransformer instance loaded above instead of the model name,
# so the same model is not loaded a second time.
# NOTE(review): `language` is presumably ignored when `embedding_model` is
# supplied - confirm against the BERTopic documentation.
model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    embedding_model=embedding_model,
    language='Dutch',
)
# Fit the BERTopic model on the Dutch documents to find topics
topics, probabilities = model.fit_transform(content_titles)
### Retrieve the topics together with their size (frequency)
topic_freq = model.get_topic_freq()

### Collect, in a single pass over `topics`, the positions of the first few
### documents assigned to each topic (avoids rescanning `topics` once per topic)
docs_per_topic = 3
first_doc_indices = {}
for doc_index, assigned_topic in enumerate(topics):
    indices_for_topic = first_doc_indices.setdefault(assigned_topic, [])
    if len(indices_for_topic) < docs_per_topic:
        indices_for_topic.append(doc_index)

### Generate the topics and their details
topic_details = []
for index, row in topic_freq.iterrows():
    topic_id = row['Topic']
    topic_info = model.get_topic(topic_id)
    topic_representation = [word[0] for word in topic_info[:10]]  # Top 10 words as a list
    topic_name = "_".join(topic_representation[:4])  # Concatenate top 4 words for the name
    # Example documents are simply the first ones assigned to the topic. The
    # snippets are taken from `content_titles` (the list the model was fitted
    # on), whose missing values were replaced by "", so slicing cannot fail on NaN.
    representative_docs = [
        content_titles[i][:200]  # First 200 chars of each document
        for i in first_doc_indices.get(topic_id, [])
    ]
    topic_details.append({
        'Topic': topic_id,
        'Count': row['Count'],
        'Name': topic_name,
        'Representation': topic_representation,
        'Representative_Docs': representative_docs
    })

### Convert the list of topic details to a DataFrame and sort by topic size
topic_details_df = pd.DataFrame(topic_details)
topic_details_df.sort_values(by='Count', ascending=False, inplace=True)

### Display the DataFrame
print(topic_details_df)
# Display all topics of the fitted model: a dict that maps each topic id to a list of (word, score) tuples
model.get_topics()
{-1: [('we', 0.007486693671432411),
('jaar', 0.005841746514178842),
('mensen', 0.005760335472060161),
('wel', 0.0053916899802409326),
('intelligentie', 0.005176980062808073),
('zegt', 0.005122953483765532),
('kunstmatige', 0.005041534678303338),
('nieuwe', 0.004908017523229031),
('gaat', 0.004751231408093411),
('waar', 0.004696876955222825)],
0: [('robot', 0.036894319081756936),
('men', 0.009610574570887872),
('mensen', 0.009350006461776817),
('we', 0.008977446982004506),
('machine', 0.00819859066770282),
('intelligentie', 0.008019534151900733),
('kunstmatige', 0.007426000831894131),
('banen', 0.006814265713362343),
('werk', 0.00596302546298388),
('gaan', 0.005880697831705383)],
1: [('patiënten', 0.02054571676460878),
('patiënt', 0.01879193747453304),
('zorg', 0.013898342519898497),
('artsen', 0.012666471470354638),
('medische', 0.01208717804474124),
('philip', 0.010798196805020838),
('ziekenhuis', 0.010701476989546876),
('art', 0.010257983328512037),
('ziekenhuizen', 0.008857686281916579),
('zegt', 0.007659157110545914)],
2: [('auto', 0.04843310437881789),
('rijden', 0.015910556461750054),
('zelfrijdende', 0.015823320394147612),
('tesla', 0.012237675204364211),
('elektrische', 0.012154891939629823),
('bestuurder', 0.00968288950438033),
('bmw', 0.009117512960606797),
('kilometer', 0.008714056144262496),
('stuur', 0.008384341857783556),
('mercedes', 0.008144737126493625)],
3: [('beleggers', 0.026467443138532998),
('rente', 0.01610901335310583),
('inflatie', 0.01498591452145292),
('aex', 0.014748895709551435),
('aandelen', 0.014567585566011993),
('amerikaanse', 0.012617841423292532),
('aandeel', 0.01233940965485618),
('centrale', 0.012312847633199259),
('lager', 0.012121198454777196),
('beurs', 0.012107457442931171)],
4: [('chatgpt', 0.03850379806447094),
('chatbot', 0.03411144536881021),
('chatbots', 0.018616167463824263),
('openai', 0.012682621379957747),
('google', 0.009953579546612235),
('bing', 0.009701879674701562),
('teksten', 0.009613864292455254),
('bot', 0.009543269691302208),
('antwoord', 0.009284442108851454),
('antwoorden', 0.009075252583562282)],
5: [('europese', 0.03170400335016237),
('europa', 0.024239461613010913),
('eu', 0.02423497280347295),
('brussel', 0.01570189314577477),
('commissie', 0.014542876104929362),
('china', 0.013457062308428157),
('lidstaten', 0.011853367672038548),
('duitsland', 0.011299785777401388),
('macron', 0.011050098241167783),
('landen', 0.0107575713189513)],
6: [('mln', 0.018129897408156365),
('bedrijven', 0.017915474626858302),
('investeerders', 0.013664917143833085),
('jaar', 0.013406939820455285),
('bedrijf', 0.013202050297342009),
('geld', 0.013168934888138422),
('fonds', 0.012642215878518027),
('investeringen', 0.011206077080362151),
('mrd', 0.010804337063322905),
('nederland', 0.009404007362804825)],
7: [('film', 0.025833011587163484),
('the', 0.014197174045375014),
('sciencefiction', 0.012895229708015047),
('depp', 0.010680551410291204),
('regisseur', 0.009238444502253631),
('uur', 0.008910894908988363),
('genre', 0.00845613946079103),
('alien', 0.008452462082901522),
('machina', 0.008041264615053952),
('westworld', 0.007948262225432665)],
8: [('dieren', 0.027959134970174982),
('natuur', 0.010794302118141613),
('we', 0.010456012053981563),
('soorten', 0.010005038172921043),
('bomen', 0.009659389135932351),
('dier', 0.009588568033883187),
('planten', 0.00800509912776644),
('geluiden', 0.007607768408833419),
('biodiversiteit', 0.006332818893363864),
('haven', 0.006302809649575891)],
9: [('ai', 0.020015044217339985),
('intelligentie', 0.01585315766014987),
('computer', 0.014428266257230453),
('kunstmatige', 0.01381441795455606),
('men', 0.010040684554594215),
('machine', 0.009972216799732881),
('mensen', 0.009512063400004895),
('we', 0.009237523888398335),
('google', 0.00757490235430275),
('menselijke', 0.006635748392810741)],
10: [('democratie', 0.01081273017934664),
('we', 0.008897075371402958),
('politieke', 0.008281272067715357),
('partij', 0.007810137396257425),
('politiek', 0.007741899787388279),
('mensen', 0.007338217756042906),
('partijen', 0.007160627204172948),
('samenleving', 0.006595275516290021),
('onze', 0.005985739397760472),
('kamer', 0.005976853840916498)],
11: [('china', 0.05255032117446615),
('chinese', 0.025933538769083293),
('xi', 0.02005647185063052),
('trump', 0.011700512117273892),
('amerikaanse', 0.011065261366488506),
('landen', 0.010783824369749374),
('beijing', 0.010749226939086406),
('taiwan', 0.010111998813764643),
('land', 0.00995176688549835),
('economische', 0.00950688617621198)],
12: [('brein', 0.03491756621367846),
('hersenen', 0.024459728120974376),
('neuralink', 0.015220383672101233),
('musk', 0.014533443235677734),
('chip', 0.012693202174289712),
('computer', 0.011654856109238337),
('gedachten', 0.010645746681771585),
('we', 0.010127378349971773),
('proefpersonen', 0.009755965645299134),
('neuronen', 0.009401203258197348)],
13: [('roman', 0.018320271607424346),
('boek', 0.013719131023525087),
('frankenstein', 0.013164989492325165),
('verhaal', 0.012834789107015415),
('mary', 0.009142123340811248),
('shelley', 0.009006890324468542),
('leven', 0.008792163828805592),
('jezus', 0.008107720214931944),
('beslisser', 0.007993921982795684),
('fictie', 0.007741907994341967)],
14: [('spel', 0.027329789144501505),
('game', 0.02602762248489176),
('schaken', 0.02397336232212592),
('computer', 0.01745600458746091),
('go', 0.013926178771724345),
('spelers', 0.013372154554443794),
('spelen', 0.013064686927075934),
('speler', 0.012290860464764474),
('spellen', 0.01214900749602897),
('alphago', 0.011181251430792582)],
15: [('fonds', 0.01718232978919989),
('geld', 0.0170836528138853),
('kabinet', 0.015507136279966728),
('economische', 0.013941990209534324),
('hoekstra', 0.013800905329048093),
('euro', 0.013365841160077758),
('wiebes', 0.013280787061013885),
('economie', 0.013135545046681227),
('groeifonds', 0.01193689778422462),
('crisis', 0.011786022407887623)],
16: [('banen', 0.029075959532796364),
('ai', 0.01727987218723208),
('werknemers', 0.016057461555312808),
('arbeidsmarkt', 0.015722247280857804),
('werk', 0.014919887726648146),
('wework', 0.00979397903856953),
('mensen', 0.00907184175858113),
('nieuwe', 0.00853580674894267),
('bedrijven', 0.007893885635176215),
('gaan', 0.007856328827606928)],
17: [('valley', 0.030819368815575558),
('silicon', 0.02871996800888954),
('san', 0.01090180320034695),
('francisco', 0.01050290156722862),
('stanford', 0.01003659502451361),
('bedrijven', 0.008626651291092571),
('facebook', 0.006950928126626263),
('waar', 0.006871898576345437),
('bedrijf', 0.006688764543352133),
('techbedrijven', 0.006368574357398072)],
18: [('microsoft', 0.07270962025768889),
('openai', 0.019374333620835266),
('nadella', 0.018428226602917937),
('alphabet', 0.016695184382550566),
('mrd', 0.016483994434767217),
('apple', 0.015297808245989907),
('dollar', 0.013144978482180228),
('bedrijf', 0.012853638145709667),
('miljard', 0.012706775282784168),
('google', 0.012513094328993096)],
19: [('we', 0.007528589780243601),
('valk', 0.006629608536295808),
('wel', 0.006580696227294249),
('jaar', 0.00639973606913346),
('maria', 0.005802768208002155),
('familie', 0.005714953862863134),
('heel', 0.005592742431633042),
('goed', 0.005571436269298467),
('dhl', 0.005238346012580068),
('nederland', 0.005219273728395737)],
20: [('asml', 0.05457544097786256),
('chip', 0.04910054466749747),
('kwartaal', 0.02645708286768284),
('omzet', 0.021359547783102166),
('wennink', 0.01955391258641443),
('mrd', 0.018885044108444018),
('tsmc', 0.017924419870879896),
('vraag', 0.016652493822619924),
('machine', 0.016089619160050674),
('bedrijf', 0.015661072714873974)],
21: [('china', 0.03304394963518705),
('chinese', 0.032718086928642566),
('overheid', 0.013446449339427438),
('bedrijven', 0.011673661516661096),
('chinezen', 0.010992568995933394),
('peking', 0.010880668129019017),
('amerikaanse', 0.009484798464462202),
('tiktok', 0.009014391535767633),
('data', 0.0075168864984134325),
('huawei', 0.007500668189391401)],
22: [('muziek', 0.04355922681086032),
('piano', 0.02030950012400095),
('universal', 0.018121508902470176),
('artiesten', 0.01722795505273336),
('tiktok', 0.015178493651215607),
('liedje', 0.014924765640932196),
('beethoven', 0.014907882481561883),
('nummer', 0.013391229174223515),
('spotify', 0.013345859805017984),
('stem', 0.012291892137664873)],
23: [('ai', 0.02283307273986205),
('nederland', 0.016999920143917543),
('intelligentie', 0.01595990087832184),
('bedrijven', 0.014806636642981336),
('europa', 0.014608275850770588),
('investeren', 0.014108628091212442),
('kunstmatige', 0.013426913829129871),
('universiteiten', 0.012882362358324097),
('rijke', 0.012827008305885604),
('kabinet', 0.012544011242746777)],
24: [('medium', 0.014184759293968617),
('nepnieuws', 0.013313360069305126),
('verkiezingen', 0.011506851519843255),
('desinformatie', 0.011257252523194518),
('empathie', 0.010856642187642066),
('trump', 0.009510688763654357),
('sociale', 0.008940136027120672),
('facebook', 0.008637944800341397),
('hamas', 0.008326415013536124),
('deepfakes', 0.0074517721712175225)],
25: [('kunst', 0.018623475795978978),
('nachtwacht', 0.01834239840411389),
('museum', 0.01731988235630514),
('rembrandt', 0.017022868762467166),
('schilderij', 0.015137371325980446),
('werk', 0.014536727011108523),
('kunstenaar', 0.011448652448489173),
('rijksmuseum', 0.010718256950097073),
('kunstenaars', 0.010623974295294419),
('zien', 0.009411188466704908)],
26: [('china', 0.05659312419989554),
('asml', 0.052236535494392854),
('chip', 0.050337745550440176),
('geavanceerde', 0.028479297854525448),
('chinese', 0.02811836239547994),
('amerikaanse', 0.024563009090252733),
('export', 0.01951203536144393),
('chipmachines', 0.016712161910090863),
('nanometer', 0.014910724276986299),
('machine', 0.014405894830052246)],
27: [('we', 0.014831080602368343),
('mensheid', 0.014614574617482565),
('aarde', 0.010760385177413678),
('rees', 0.01075292985442079),
('toekomst', 0.009583059162370086),
('ai', 0.009296941103872434),
('men', 0.00878249942767476),
('risico', 0.008493905576458263),
('mensen', 0.007591491517162478),
('leven', 0.0074035398094496226)],
28: [('europese', 0.031041140471569476),
('ai', 0.03045766751641336),
('regels', 0.029648282200635916),
('parlement', 0.02331293192866366),
('act', 0.02057936663468225),
('europees', 0.019629179011075287),
('wet', 0.01830853027051129),
('gezichtsherkenning', 0.017526351979474157),
('commissie', 0.016882362876275748),
('eu', 0.01660525919491919)],
29: [('digitale', 0.026640486416918113),
('digitalisering', 0.02029556797538793),
('overheid', 0.012953276052304919),
('burger', 0.012782211629625426),
('we', 0.012507854398083513),
('data', 0.0121576476508313),
('onze', 0.010684634520230274),
('stad', 0.009904874409303838),
('technologie', 0.009205533091304114),
('politieke', 0.007465818252487621)],
30: [('facebook', 0.07362798990987332),
('zuckerberg', 0.02227912090320167),
('gebruikers', 0.01566244146484264),
('berichten', 0.01295317382572982),
('platform', 0.00982646413459673),
('bedrijf', 0.009646861643307525),
('mensen', 0.00956030658121607),
('erik', 0.009161775236406034),
('nepnieuws', 0.009106100744170339),
('charon', 0.008714566640963605)],
31: [('onderwijs', 0.03317593682511713),
('studenten', 0.02912873187124027),
('universiteiten', 0.02109638135362824),
('universiteit', 0.015728112754281848),
('school', 0.012269018684016423),
('scholen', 0.012241692800035838),
('studie', 0.01161492990082033),
('technische', 0.011109649894253684),
('hoger', 0.010379882463721173),
('student', 0.009858215311171882)],
32: [('nvidia', 0.1382643492761839),
('chip', 0.039101525348793194),
('huang', 0.03263532195478313),
('intel', 0.020780675887409304),
('bedrijf', 0.020642361948870758),
('grafische', 0.019567704589832487),
('mrd', 0.017942506163389615),
('jensen', 0.017275434305214905),
('aandeel', 0.016494566138732316),
('omzet', 0.01625381773195205)],
33: [('coronavirus', 0.01996576214599905),
('china', 0.019407004473973515),
('virus', 0.0162571321070852),
('chinese', 0.010789865432575589),
('ggd', 0.009271750289954266),
('corona', 0.009031967577803751),
('palantir', 0.00901515736706658),
('ademtest', 0.00816679573979107),
('pandemie', 0.007483301553037099),
('antisemitische', 0.007339871655480623)],
34: [('foto', 0.039847876224386625),
('gezichtsherkenning', 0.03342935492184306),
('gezichten', 0.02105758048190932),
('clearview', 0.019312662972965522),
('camera', 0.01567420935545189),
('gezicht', 0.012890046842553203),
('technologie', 0.010578424245542136),
('software', 0.009425047368751407),
('politie', 0.008835769201594698),
('google', 0.00879840853934739)],
35: [('winkel', 0.02540748955722635),
('zelfscankassa', 0.024162619436767608),
('klanten', 0.018897124443852734),
('winkeldiefstal', 0.01587445521649987),
('winkels', 0.01538491273235142),
('diefstal', 0.014799843325932415),
('supermarkten', 0.014770676446910077),
('supermarkt', 0.0139787956130453),
('jumbo', 0.01359793855836441),
('boodschappen', 0.013411632165475706)],
36: [('acteurs', 0.06179829590791042),
('staking', 0.061241210964896646),
('schrijvers', 0.04659947836446929),
('hollywood', 0.042574317046335054),
('streamingdiensten', 0.031688083596126876),
('wga', 0.026833757233060706),
('studio', 0.02608735095031966),
('series', 0.024464478577851875),
('film', 0.02438477732222389),
('vakbond', 0.022641272413075442)],
37: [('samsung', 0.07838884673408475),
('apple', 0.02072110477118594),
('smartphone', 0.017475769158254006),
('telefoons', 0.017185277019363454),
('huawei', 0.017084772328089956),
('telefoon', 0.015755056589734664),
('lee', 0.014156562685128419),
('iphone', 0.0138529218261527),
('galaxy', 0.012614670387571268),
('nieuwe', 0.01180484239346215)],
38: [('apple', 0.11222821236149473),
('iphone', 0.033639690205776514),
('cook', 0.018244713504359366),
('kinderporno', 0.015675455763230234),
('google', 0.013944139074658583),
('auto', 0.011173352352370384),
('iphones', 0.010939457039879684),
('amazon', 0.010677318916326825),
('jaar', 0.010624317139389104),
('nieuwe', 0.010226730864612963)],
39: [('journalistiek', 0.02437992171608028),
('journalisten', 0.020096238875964692),
('nieuws', 0.018634344266459307),
('artikelen', 0.014315531955046062),
('ai', 0.01364353057946019),
('medium', 0.010237571935797121),
('krant', 0.009800068360765229),
('mensen', 0.009452105588919643),
('informatie', 0.00938257374893693),
('channel', 0.008789438160282706)],
40: [('algoritmes', 0.03186138509593045),
('data', 0.02905795876899499),
('algoritmen', 0.021501745176739733),
('register', 0.02014453477121616),
('big', 0.01704017172076839),
('burger', 0.016376090130409665),
('gemeente', 0.01495775382190589),
('algoritme', 0.013804508762642125),
('toezicht', 0.013534813679555967),
('overheid', 0.012134597172895677)],
41: [('nederland', 0.02545478613080527),
('economie', 0.012440853139932283),
('bedrijven', 0.011577138467594662),
('landen', 0.010367445847695545),
('nederlandse', 0.009631545549985),
('schwarz', 0.009524851656561596),
('volberda', 0.009065504051425415),
('investeringen', 0.008771879299791655),
('we', 0.008743880119130885),
('wapenfeit', 0.008726124348079832)],
42: [('deepfakes', 0.04069482968773314),
('video', 0.024093761523222547),
('deepfake', 0.023708647367245567),
('beelden', 0.015584472389228827),
('porno', 0.014621918150414394),
('foto', 0.012946894389676248),
('deepfakeporno', 0.012501959801129385),
('seksueel', 0.012079087112129806),
('vrouwen', 0.011616380999576391),
('slachtoffer', 0.011480126720331961)],
43: [('bank', 0.048623689926658185),
('banken', 0.0481008927654361),
('ing', 0.02610760315206898),
('financiële', 0.021370235433630827),
('revolut', 0.019857046430125078),
('fintech', 0.01890675776403956),
('klanten', 0.016402667555210956),
('abn', 0.01632346049466571),
('sector', 0.015592290604082706),
('amro', 0.015054952281422935)],
44: [('studenten', 0.03823428686360691),
('docenten', 0.023542144413973218),
('onderwijs', 0.022271720767519626),
('leerlingen', 0.02069375249449165),
('programmeren', 0.017713637782671213),
('leren', 0.014987330654177195),
('chatgpt', 0.014261773105292974),
('vaardigheden', 0.013975193533944181),
('opleidingen', 0.013898894138339992),
('scholen', 0.013745955221577035)],
45: [('film', 0.018261687889153062),
('couture', 0.014296797144129373),
('herzog', 0.012963508351857129),
('marsigliese', 0.011700504737550367),
('dior', 0.011342545700525156),
('chanel', 0.010946621935943675),
('uur', 0.010346436447664793),
('dance', 0.00985195974234931),
('personage', 0.009312296236949976),
('show', 0.008737982146009193)],
46: [('google', 0.07643432514953216),
('zoekmachine', 0.033056215564139214),
('alphabet', 0.019387924412855893),
('microsoft', 0.016338812566557264),
('gemini', 0.01583634798907),
('gebruikers', 0.01435887055982529),
('data', 0.013659258361361741),
('page', 0.013637143391797817),
('bedrijf', 0.013016339938508734),
('bing', 0.012218660067047452)],
47: [('dnb', 0.09049556653438254),
('bunq', 0.0773111139510025),
('banken', 0.06678080768852333),
('witwassen', 0.03906454773714225),
('bank', 0.03765731808882712),
('toezichthouder', 0.03576303565089094),
('transacties', 0.028263292715932826),
('klanten', 0.024213681221675585),
('niknam', 0.021616513219079064),
('financiële', 0.02098139723558336)],
48: [('oekraïne', 0.03745135366486887),
('drone', 0.029988022131922327),
('oorlog', 0.02263246953352628),
('russische', 0.022308243942435965),
('oekraïense', 0.018523852949238345),
('militaire', 0.014836120841785599),
('rusland', 0.014142339813023532),
('thales', 0.0136247653725483),
('wapens', 0.01257684854528385),
('russen', 0.01103601481349522)],
49: [('privacy', 0.0323302433463922),
('avg', 0.019931577463772065),
('data', 0.019389952376528366),
('europese', 0.013993969536490932),
('gegevens', 0.012451953172739678),
('facebook', 0.012186627732606283),
('google', 0.01095620717871909),
('microklussers', 0.010938383526115498),
('advertenties', 0.01059946012757427),
('microklussen', 0.010342799095003359)],
50: [('vrouwen', 0.023235538471334766),
('mannen', 0.01812429203182792),
('seks', 0.01138792491076651),
('homo', 0.010774500149032848),
('vrouw', 0.009970067569380786),
('kosinski', 0.009262922035193216),
('epstein', 0.008992170771015374),
('seksuele', 0.007938273239968171),
('we', 0.007848067121773702),
('man', 0.007541549401764835)],
51: [('arbeidsmarkt', 0.0240093875688311),
('arbeidsproductiviteit', 0.0226865825153014),
('werknemers', 0.0142446414947354),
('werkgevers', 0.013746245749089841),
('productiviteit', 0.013568126038422916),
('economie', 0.013017484557473223),
('werken', 0.012720098482658183),
('productiviteitsgroei', 0.012697634902260226),
('nederland', 0.012001366274995382),
('sectoren', 0.0118802600855739)],
52: [('drone', 0.08841573992697767),
('ballon', 0.019045797278174313),
('vliegen', 0.01842712560128632),
('lucht', 0.018070006791596593),
('delft', 0.01400321999182485),
('riemens', 0.013409627783314447),
('ballonnen', 0.009389396196219282),
('middendorp', 0.009228414862249177),
('zegt', 0.009153844673451527),
('oostrum', 0.009111559560882151)],
53: [('virtuele', 0.030923558532187347),
('avatar', 0.030837969665672888),
('metaverse', 0.023516051794602096),
('virtual', 0.018480223260516418),
('reality', 0.018270828329401263),
('vr', 0.01413782954533946),
('digitale', 0.012786368879852576),
('metaversum', 0.011634082573884577),
('bril', 0.011332944348925194),
('virtueel', 0.011226196244377415)],
54: [('taal', 0.027963328622367185),
('translate', 0.027015390166831306),
('vertalers', 0.02343929571709614),
('vertalingen', 0.022155758866782886),
('talen', 0.020411989250542516),
('vertalen', 0.019297901992057564),
('gebarentaal', 0.017308988349071786),
('vertaling', 0.016832348458685337),
('google', 0.016748560934286873),
('arabisch', 0.01585642698951597)],
55: [('wapens', 0.0781336319779496),
('autonome', 0.05072535314780074),
('robot', 0.028747503311168317),
('killer', 0.025482076120396695),
('verbod', 0.018469299357596825),
('drone', 0.015600537299756457),
('autonoom', 0.014290267828753228),
('pax', 0.013871531406333312),
('wapensystemen', 0.013510335453636288),
('brief', 0.013507795228199558)],
56: [('spelers', 0.026279769356587154),
('club', 0.024277135109256463),
('bal', 0.02204047737100079),
('voetbal', 0.020550381467079316),
('scisports', 0.018078381142403635),
('sport', 0.01754475511944718),
('wedstrijden', 0.015215056840477597),
('ajax', 0.01354937635238009),
('brouwer', 0.013526641347020819),
('wedstrijd', 0.013519471468271333)],
57: [('algoritme', 0.04389544885480664),
('algoritmes', 0.04153184732150761),
('algoritmen', 0.03678659743190971),
('beslissingen', 0.019003189445809726),
('fry', 0.01570488929149069),
('mensen', 0.013292812246911414),
('algoritmische', 0.012152977993770929),
('we', 0.011965433829654338),
('menselijke', 0.010417969542037204),
('fouten', 0.009323198409820235)],
58: [('klm', 0.04325929006531934),
('vliegtuig', 0.030714700310642634),
('vliegtuigen', 0.026888690808260667),
('schiphol', 0.02250176884540963),
('passagiers', 0.0186936508232281),
('rintel', 0.018078236084303693),
('vliegen', 0.017437474235092724),
('piloot', 0.01655168472801233),
('luchthaven', 0.01482464204808255),
('toestellen', 0.012897553770407291)],
59: [('stem', 0.03099679375851028),
('tinnitus', 0.029597134232338985),
('geluid', 0.018022601058312163),
('ridder', 0.014544277903284615),
('whispp', 0.013006014233020669),
('via', 0.012669533327183614),
('starkey', 0.01243584641985006),
('castermans', 0.011758726578725362),
('serdijn', 0.010383303466736547),
('sawalich', 0.010383303466736547)],
60: [('hacker', 0.027636735308037255),
('moerel', 0.020097568469805694),
('bedrijven', 0.012031390771884577),
('cybersecurity', 0.012010053289878135),
('criminelen', 0.01113611810453505),
('website', 0.0101717402378479),
('cybersprint', 0.009740114424781496),
('phishing', 0.009196243438149872),
('ransomware', 0.008835109119620039),
('internet', 0.008801859853171294)],
61: [('altman', 0.14102119698967272),
('openai', 0.1015710325300196),
('bestuur', 0.05747783384413201),
('sam', 0.038525689131678136),
('ontslag', 0.03568949222590706),
('topman', 0.03261254722196201),
('brockman', 0.03193199604837349),
('soros', 0.0277330150238057),
('microsoft', 0.025840846128625673),
('sutskever', 0.022742284100814896)],
62: [('wapens', 0.037579541031739806),
('autonome', 0.03184030739144296),
('militaire', 0.02264954938527403),
('drone', 0.021960913663308154),
('defensie', 0.016309512250959383),
('wapensystemen', 0.015821213182588167),
('ai', 0.014663109982916516),
('militairen', 0.01329124972569004),
('systemen', 0.011888935495414376),
('verdrag', 0.010969135966135834)],
63: [('cloud', 0.07169687990321258),
('microsoft', 0.03731169961703853),
('amazon', 0.024516316933465624),
('oracle', 0.024012023188955206),
('google', 0.02338510395534385),
('bedrijven', 0.01962741711526994),
('data', 0.017917529013155954),
('clouddiensten', 0.017832008943814242),
('azure', 0.01588130498704494),
('mrd', 0.01418703034539035)],
64: [('chainalysis', 0.031963491878098646),
('cryptomunten', 0.027006183239194598),
('bitcoin', 0.02068948583400805),
('coinbase', 0.016489240726289874),
('worldcoin', 0.015082309304071942),
('bitcoins', 0.01497978318632785),
('gronager', 0.013531356710740879),
('criminelen', 0.013153100927644957),
('cryptomunt', 0.01287562658619239),
('cryptovaluta', 0.01170242524877489)],
65: [('datacenters', 0.09250453908788336),
('datacentra', 0.024567228169454766),
('datacenter', 0.02375277534878531),
('stroom', 0.023305579912576445),
('dda', 0.022812290526099153),
('nederland', 0.02236831597883887),
('grove', 0.01835216162280343),
('apparatuur', 0.01699588264289452),
('amsterdam', 0.014118266961348285),
('huawei', 0.012833675795747622)],
66: [('ai', 0.024373660359767502),
('sheikh', 0.022911151137812825),
('zelfrijdende', 0.019768704283732595),
('intelligentie', 0.0193889254594412),
('kunstmatige', 0.018835880289904888),
('wrr', 0.01845407464686386),
('afm', 0.017954740887184673),
('auto', 0.0173475210179342),
('dobbelaere', 0.01612973953465127),
('rapport', 0.015302639539079448)],
67: [('advocaten', 0.047869690048671906),
('juridische', 0.045160477643869414),
('legal', 0.02942688694104348),
('advocaat', 0.02693618278359918),
('kantoren', 0.024629087472820704),
('advocatuur', 0.022606747533347554),
('advocatenkantoren', 0.02123340688089756),
('legaltech', 0.018696998886393815),
('juristen', 0.015982974519594486),
('overy', 0.015654107330778065)],
68: [('kandidaten', 0.04367373625548949),
('recruiter', 0.04059988443337879),
('cv', 0.02841701933068386),
('kandidaat', 0.023956661099390424),
('sollicitanten', 0.020445619350926883),
('oostrom', 0.019436682990122387),
('algoritme', 0.01736537713714318),
('akhlal', 0.016339307318545562),
('unilever', 0.013573547331955952),
('sollicitant', 0.012513266790041628)],
69: [('ai', 0.04440479196872834),
('risico', 0.027107681214019953),
('sunak', 0.022799892252258223),
('bletchley', 0.019591834607778892),
('britse', 0.015207195899922171),
('wetgeving', 0.014117647977543557),
('technologie', 0.012597777224588965),
('top', 0.011148902166749783),
('we', 0.010859002736126798),
('park', 0.010810450500599724)],
70: [('russische', 0.08139310949383995),
('volozj', 0.07013517038729875),
('yandex', 0.0516088989642387),
('aivd', 0.03746322566718366),
('spionage', 0.03694751963616592),
('diplomaten', 0.033952971870485016),
('russen', 0.02939391204516434),
('rusland', 0.028250598128174105),
('sanctielijst', 0.02713202981735881),
('ambassade', 0.026961131735766673)],
71: [('musk', 0.06650748951511597),
('tesla', 0.030365478445581934),
('spacex', 0.028314404380681616),
('elon', 0.026090808052061758),
('aarde', 0.020386576924022716),
('mar', 0.018498118552802065),
('raket', 0.01755896371306701),
('isaacson', 0.01441471552480549),
('cooijmans', 0.014198992401643771),
('ruimtevaart', 0.013119357697234069)],
72: [('politie', 0.04781282485078935),
('plas', 0.03362713788792385),
('criminelen', 0.01928401973533707),
('gegevens', 0.017024873007959004),
('criminaliteit', 0.016655551915115818),
('cybercrime', 0.016388437673752608),
('aangifte', 0.01623265590662635),
('wolfert', 0.014087950454538587),
('aangiftes', 0.01339569906638745),
('cold', 0.012624545326596286)],
73: [('chinese', 0.0430256874650782),
('china', 0.03808194540686919),
('universiteiten', 0.03361741992495753),
('campus', 0.030269088053335946),
('boekhoorn', 0.025857127204376187),
('samenwerking', 0.024108856450804114),
('jorritsma', 0.02396477886377274),
('studenten', 0.02133448062699776),
('nederlandse', 0.02090022547250005),
('onderzoek', 0.017712554526376992)],
74: [('loznitsa', 0.019922144019883646),
('hoofddoek', 0.014497285714296558),
('facebook', 0.013162652329102157),
('ressa', 0.012158517472742275),
('regime', 0.012046000234721532),
('vrouwen', 0.011851095935147798),
('hamas', 0.011421999480738727),
('cvz', 0.01092148489653135),
('russische', 0.01082440582285944),
('klette', 0.01067176084111041)],
75: [('poolwervel', 0.03702436451447165),
('muon', 0.03207256922877086),
('satellieten', 0.03150133731368994),
('aarde', 0.02798953606905963),
('meteorieten', 0.02547428486986456),
('satelliet', 0.022027462301644433),
('methaan', 0.022008983824331148),
('noordpool', 0.021245578681822707),
('sark', 0.018425023697900594),
('hyperscout', 0.01720848573053478)],
76: [('smartphone', 0.050619241523059626),
('lens', 0.034953763007363144),
('telefoon', 0.02439119070269335),
('wouter', 0.021235008917102158),
('mobieltjes', 0.020990203733791027),
('google', 0.018999667553875284),
('pixel', 0.016500596008511864),
('leerlingen', 0.01629585114799356),
('iphone', 0.016234694789581377),
('verbod', 0.015075568259468791)],
77: [('hawking', 0.054165272806966425),
('andringa', 0.03381514166296401),
('wilczek', 0.019609269884495677),
('natuurkunde', 0.017666774996546227),
('boek', 0.012931733538506315),
('stephen', 0.012856808560909668),
('wetenschap', 0.012085276855272102),
('heelal', 0.01019552655545034),
('generalisten', 0.009757623348562427),
('zwarte', 0.009225484331909157)],
78: [('biesheuvel', 0.04002041556629835),
('ema', 0.03646489363433604),
('woerdt', 0.01755586145323328),
('nederland', 0.01603492718013505),
('wehkamp', 0.014857323518077071),
('ramakers', 0.013283343343572623),
('gamecongres', 0.011812053215886353),
('unilever', 0.011720414926673736),
('ondernemers', 0.01168674012791996),
('indigo', 0.011456108602901477)],
79: [('chip', 0.042423350031513096),
('asml', 0.03529431417119637),
('intel', 0.029005074453676803),
('europa', 0.028832796644165416),
('europese', 0.021069444553720083),
('fabriek', 0.0203544216823179),
('nauta', 0.017472708701010028),
('wennink', 0.01741550066316771),
('asmi', 0.016370730206957215),
('miljard', 0.014511816497356883)],
80: [('krant', 0.03526574497927175),
('dpg', 0.027727130991532525),
('kranten', 0.02610479077986151),
('nieuws', 0.015333359359573237),
('lezers', 0.014773755648927208),
('thillo', 0.014597649796703811),
('bild', 0.01398253932227877),
('guardian', 0.013089257924132314),
('oktober', 0.012519500697486082),
('medium', 0.01223343327041974)],
81: [('bussche', 0.04526674649416625),
('podcast', 0.045228788729228614),
('podcasts', 0.034623072743861866),
('rogan', 0.027704154029604782),
('den', 0.025011571402178096),
('spotify', 0.022400191829297977),
('kwebbelkop', 0.015736803370334486),
('jordi', 0.015337340265690568),
('youtube', 0.013017907716950761),
('strengholt', 0.012754523076080819)],
82: [('huawei', 0.0983974968933943),
('universiteiten', 0.04623969139965946),
('chinese', 0.038498490269710854),
('samenwerking', 0.03437739583316361),
('harmelen', 0.026495051279462948),
('instituten', 0.021956172817828454),
('china', 0.021786305940049717),
('confucius', 0.017183473976952867),
('wetenschappers', 0.01712817604018451),
('vu', 0.0163415435985534)],
83: [('amazon', 0.09375427812943085),
('bezos', 0.03355596884906663),
('platform', 0.02388895718633598),
('jeff', 0.013371015778371619),
('anthropic', 0.012880667475770755),
('duitsland', 0.012483738294385432),
('werknemers', 0.012084724462030287),
('vogels', 0.011922113873336462),
('alexa', 0.010937747321506725),
('choudary', 0.010830737007990594)],
84: [('bamps', 0.025203427950659676),
('data', 0.01610127780589552),
('source', 0.013595740898488343),
('onze', 0.012387276927014952),
('roman', 0.011638330436690856),
('koenig', 0.01116036588330626),
('we', 0.010982185457777741),
('meta', 0.010439373130131324),
('open', 0.010409098878651551),
('russell', 0.010360852575021541)],
85: [('twitter', 0.08520151960524652),
('musk', 0.05443472880353841),
('nepaccounts', 0.031888808778552075),
('tweet', 0.027872284047031078),
('mastodon', 0.027746572035183388),
('account', 0.018344430302111348),
('socialemediabedrijven', 0.016696621443996362),
('elon', 0.01600870568006368),
('medium', 0.01542855180234373),
('gebruikers', 0.01515647363228872)],
86: [('god', 0.024611223189918593),
('ai', 0.014788128259455668),
('blok', 0.014619879911181298),
('broersen', 0.014168532376730833),
('harari', 0.013308997315482353),
('men', 0.013267570219735491),
('we', 0.011783620641204228),
('bostrom', 0.0110896133746554),
('goddelijke', 0.011030459931300423),
('mensen', 0.010863308843100734)],
87: [('youtube', 0.08270607943481365),
('video', 0.034412667392502365),
('filmpjes', 0.0279971171361612),
('facebook', 0.0216824135093742),
('tang', 0.02167909936999572),
('hoven', 0.017641962554577915),
('inhoud', 0.01702017339353581),
('kinderen', 0.01615454961349382),
('moderator', 0.014701635462148263),
('platformen', 0.0138363228933664)],
88: [('online', 0.0224510221865735),
('unless', 0.01862990139978048),
('concept', 0.01585963649283838),
('klanten', 0.013750396304005747),
('prevoo', 0.013623565578826909),
('wappzapp', 0.013623565578826909),
('klant', 0.013433849455625677),
('nagtegaal', 0.013405432896362812),
('ezrachi', 0.012853633983237679),
('veelbelovend', 0.012774977006155644)],
89: [('aardbevingen', 0.03138213977702712),
('hooper', 0.026675417540499822),
('aardbeving', 0.025881833909104462),
('vulkanen', 0.017921012137610437),
('johnson', 0.017482656771525853),
('voorspellen', 0.016458889294327133),
('extreem', 0.016453868566745643),
('vulkaan', 0.01481967641138879),
('model', 0.014703219559803919),
('aardplaten', 0.014587569609209568)],
90: [('universiteiten', 0.03767973971984582),
('spionage', 0.03253955844351508),
('chinese', 0.02717989517608118),
('hikvision', 0.026863879311577233),
('loket', 0.02597386549937508),
('diercks', 0.023562113419335676),
('nctv', 0.02130248046495377),
('kennis', 0.020581128409258003),
('mivd', 0.020456442475896754),
('samenwerkingen', 0.01906822291910119)]}
# Plot per-topic bar charts of the top terms, limited to 90 topics.
# NOTE(review): the topic table shown below lists topic ids 0-90 (91 topics
# plus the -1 outlier row), so top_n_topics=90 may leave one topic out —
# confirm whether that is intended.
model.visualize_barchart(top_n_topics=90)
# Display the per-topic summary table (Topic, Count, Name, Representation,
# Representative_Docs).
topic_details_df
| Topic | Count | Name | Representation | Representative_Docs | |
|---|---|---|---|---|---|
| 0 | -1 | 2498 | we_jaar_mensen_wel | [we, jaar, mensen, wel, intelligentie, zegt, k... | [kunstmatige intelligentie best bedreigend kun... |
| 1 | 0 | 432 | robot_men_mensen_we | [robot, men, mensen, we, machine, intelligenti... | [tekenfilm maken gaat ai stuk sneller animatie... |
| 2 | 1 | 147 | patiënten_patiënt_zorg_artsen | [patiënten, patiënt, zorg, artsen, medische, p... | [kunstmatige intelligentie verslaat artsen ops... |
| 3 | 2 | 136 | auto_rijden_zelfrijdende_tesla | [auto, rijden, zelfrijdende, tesla, elektrisch... | [robot betere kunstmatige intelligentie zelfri... |
| 4 | 3 | 123 | beleggers_rente_inflatie_aex | [beleggers, rente, inflatie, aex, aandelen, am... | [oorlogen trage economische groei laten belegg... |
| ... | ... | ... | ... | ... | ... |
| 87 | 86 | 12 | god_ai_blok_broersen | [god, ai, blok, broersen, harari, men, we, bos... | [bedreigt kunstmatige intelligentie godsbeeld ... |
| 88 | 87 | 12 | youtube_video_filmpjes_facebook | [youtube, video, filmpjes, facebook, tang, hov... | [youtube grijpt mountain view youtube afgelope... |
| 89 | 88 | 11 | online_unless_concept_klanten | [online, unless, concept, klanten, prevoo, wap... | [online gaat winkelstraat nooit dicht verslave... |
| 90 | 89 | 10 | aardbevingen_hooper_aardbeving_vulkanen | [aardbevingen, hooper, aardbeving, vulkanen, j... | [lezersreacties tof krant nieuws mei aandacht ... |
| 91 | 90 | 10 | universiteiten_spionage_chinese_hikvision | [universiteiten, spionage, chinese, hikvision,... | [laat nederland speelbal minister generaals pr... |
92 rows × 5 columns
Based on analyzing the topics and content, the following topics were manually merged:
# Manually chosen merge groups: every inner list holds the original topic ids
# that should be collapsed into a single topic. Together the groups cover all
# ids 0-90 exactly once; the outlier topic -1 is left untouched.
topics_to_merge = [
    [1, 12, 19, 33, 50, 54, 56, 59],
    [8, 58, 75, 78, 89],
    [7, 13, 22, 25, 36, 45],
    [14, 39, 85, 88, 80, 81, 87],
    [3, 6, 15, 16, 43, 47, 51, 64, 68],
    [24, 34, 35, 42, 52, 55, 60, 62, 67, 72],
    [5, 10, 11, 21, 28, 40, 41, 48, 49, 66, 69, 70, 74, 79, 23],
    [31, 44, 73, 76, 77, 82, 86, 90],
    [0, 2, 9, 27, 29, 53, 57, 65, 84],
    [4, 17, 18, 20, 26, 30, 32, 37, 38, 46, 61, 63, 71, 83],
]

# Apply the merge to the fitted model, using the same documents it was
# trained on.
model.merge_topics(content_titles, topics_to_merge)
# Snapshot of the topic overview table after the merge.
topics_info = model.get_topic_info()

# Export the same table twice (Excel and CSV), without the DataFrame index.
merged_info_xlsx = '/Users/helgegeurtjacobusmoes/Desktop/thesis data/Merged_Topic_Info.xlsx'
merged_info_csv = '/Users/helgegeurtjacobusmoes/Desktop/thesis data/Merged_Topic_Info.csv'
topics_info.to_excel(merged_info_xlsx, index=False)
topics_info.to_csv(merged_info_csv, index=False)
Topic representations after merging:
# Display, for every topic id currently in the model, its list of
# (term, weight) pairs.
model.get_topics()
{-1: [('we', 0.022246328484310914),
('jaar', 0.014825560535742755),
('mensen', 0.014233405312292684),
('wel', 0.013011375234743543),
('intelligentie', 0.01268669676667743),
('kunstmatige', 0.012182137624903732),
('zegt', 0.012007260891035156),
('nieuwe', 0.011391891814129043),
('gaat', 0.010897747908243225),
('moeten', 0.01054144716387557)],
0: [('robot', 0.040747440280944944),
('we', 0.027470876669336355),
('mensen', 0.019451643909996912),
('auto', 0.019356006306047474),
('intelligentie', 0.018192291740770892),
('kunstmatige', 0.016748266185515403),
('men', 0.014375584320160421),
('wel', 0.013622172170076222),
('jaar', 0.01286155054987591),
('nieuwe', 0.012393326194674798)],
1: [('china', 0.028164840461059135),
('europese', 0.022046840430905077),
('we', 0.019394168050648072),
('europa', 0.017627095347032613),
('chinese', 0.014903966734085599),
('moeten', 0.014246470556257248),
('nederland', 0.014052009316309339),
('bedrijven', 0.014033153058774818),
('jaar', 0.012712776676954494),
('ai', 0.012300241160117346)],
2: [('bedrijf', 0.020442955453832214),
('microsoft', 0.019825020816631674),
('google', 0.019515951347540167),
('chatgpt', 0.018515860464228668),
('chip', 0.01836989055946684),
('jaar', 0.01697677405734086),
('facebook', 0.01685282738055883),
('apple', 0.0160507727582351),
('asml', 0.015606122255538464),
('intelligentie', 0.014374516867028383)],
3: [('jaar', 0.023184081686227948),
('banken', 0.01761774364528032),
('bedrijven', 0.016916932469397013),
('bank', 0.01666501615891372),
('geld', 0.015093450809297216),
('beleggers', 0.014860654563712003),
('bedrijf', 0.014705946804651981),
('we', 0.01390273161684443),
('nieuwe', 0.013110427064134428),
('gaat', 0.012000621949861656)],
4: [('we', 0.021930547602218842),
('zegt', 0.014452921571927594),
('mensen', 0.014238437413663402),
('patiënten', 0.01378244587698588),
('wel', 0.013054381290297448),
('jaar', 0.012210726807541672),
('kunstmatige', 0.01197661314466977),
('intelligentie', 0.011900529992663087),
('patiënt', 0.011619452284720125),
('gaat', 0.010728236808870091)],
5: [('film', 0.01847755447647912),
('the', 0.016350241210528083),
('we', 0.01543026679492675),
('muziek', 0.012888694380478011),
('wel', 0.012334602999807629),
('intelligentie', 0.012275216231019716),
('kunstmatige', 0.011649908596628874),
('jaar', 0.011306595788701196),
('werk', 0.011084767144725113),
('wereld', 0.010643657791755752)],
6: [('we', 0.019330371749468046),
('foto', 0.01711297029532976),
('wapens', 0.016277234436833318),
('drone', 0.015328767286442876),
('mensen', 0.01504924850381791),
('zegt', 0.014326810144608688),
('kunstmatige', 0.013447552842361231),
('intelligentie', 0.013419206847327513),
('maken', 0.013239187128046752),
('wel', 0.01293068359901015)],
7: [('spel', 0.018312580198690025),
('we', 0.017890999492087493),
('computer', 0.017318754694953524),
('game', 0.016736522741384936),
('mensen', 0.01566625183046286),
('schaken', 0.01417092688063822),
('intelligentie', 0.013726519506834602),
('zegt', 0.01339559450882493),
('jaar', 0.01325947556191445),
('kunstmatige', 0.013200768323543527)],
8: [('studenten', 0.029645865440638606),
('universiteiten', 0.02785055605586397),
('onderwijs', 0.026812243473693962),
('we', 0.020442958389444876),
('chinese', 0.017319446072270553),
('universiteit', 0.01653292988117574),
('zegt', 0.01483222235448319),
('china', 0.014413560905648978),
('huawei', 0.014151140263779867),
('wel', 0.014024755186748915)],
9: [('we', 0.028868686909911203),
('dieren', 0.027081379669085564),
('jaar', 0.014316680112669084),
('zegt', 0.013786552606789631),
('waar', 0.012236350448078267),
('gaat', 0.012000136818076594),
('wel', 0.011432472326371021),
('zien', 0.011080247028019128),
('natuur', 0.010665189256561906),
('data', 0.010562430487446218)]}
# Attach the topic assigned to each news article, and the probability of that
# assignment, to the DataFrame.
data['Topic'] = topics
data['Probabilities'] = probabilities

# Show the distinct topic ids that occur. `topics` is deliberately NOT rebound
# to this array: it has to keep holding the per-article assignments, otherwise
# re-running this cell would try to assign the short array of unique ids to the
# 'Topic' column and fail with a length mismatch.
print(data['Topic'].unique())
[ 9 69 -1 86 8 28 0 24 1 25 4 61 2 21 46 74 22 75 35 53 12 13 29 39 89 56 54 44 36 32 50 42 45 11 10 7 59 18 23 5 57 31 66 68 20 62 64 16 84 15 37 3 88 51 85 67 90 33 30 48 65 27 72 40 41 55 6 60 26 70 14 38 19 49 34 58 76 71 83 80 81 47 77 52 79 17 63 73 43 78 87 82]
Planned merge of the fine-grained topics into broader groups, written as `merged id = old id: merged id, ...` (note: ids 1, 81 and 87 are each listed under two groups):

- -1 = AI in society: -1
- 0 = 1: 0, 2: 0, 9: 0, 27: 0, 29: 0, 53: 0, 57: 0, 65: 0, 84: 0
- 1 = 5: 1, 10: 1, 11: 1, 21: 1, 28: 1, 40: 1, 41: 1, 48: 1, 49: 1, 66: 1, 69: 1, 70: 1, 74: 1, 79: 1, 23: 1
- 2 = 4: 2, 17: 2, 18: 2, 20: 2, 26: 2, 30: 2, 32: 2, 37: 2, 38: 2, 46: 2, 61: 2, 63: 2, 71: 2, 83: 2
- 3 = 3: 3, 6: 3, 15: 3, 16: 3, 43: 3, 47: 3, 51: 3, 64: 3, 68: 3
- 4 = 1: 4, 12: 4, 19: 4, 33: 4, 50: 4, 54: 4, 56: 4, 59: 4
- 5 = 13: 5, 22: 5, 25: 5, 36: 5, 45: 5, 81: 5, 87: 5
- 6 = 24: 6, 34: 6, 35: 6, 42: 6, 52: 6, 55: 6, 60: 6, 62: 6, 67: 6, 72: 6
- 7 = 14: 7, 39: 7, 85: 7, 88: 7, 80: 7, 81: 7, 87: 7
- 8 = 31: 8, 44: 8, 73: 8, 76: 8, 77: 8, 82: 8, 86: 8, 90: 8
- 9 = 8: 9, 58: 9, 75: 9, 78: 9, 89: 9
# Merge the fine-grained topic ids into the broader groups 0-9.
# Keys are the merged ids, values are the original ids folded into them.
# Ids that are not listed (-1, 0 and 7) are left unchanged by the replacement.
#
# NOTE(review): the original mapping was a single dict literal in which ids 1,
# 81 and 87 appeared twice (1 -> 0 and 1 -> 4; 81/87 -> 5 and 81/87 -> 7).
# Python keeps only the LAST value for a duplicated key, so the mapping that
# was actually applied is 1 -> 4, 81 -> 7 and 87 -> 7. That effective behaviour
# is preserved here; confirm it is the intended grouping.
merged_topic_members = {
    0: [2, 9, 27, 29, 53, 57, 65, 84],
    1: [5, 10, 11, 21, 28, 40, 41, 48, 49, 66, 69, 70, 74, 79, 23],
    2: [4, 17, 18, 20, 26, 30, 32, 37, 38, 46, 61, 63, 71, 83],
    3: [3, 6, 15, 16, 43, 47, 51, 64, 68],
    4: [1, 12, 19, 33, 50, 54, 56, 59],
    5: [13, 22, 25, 36, 45],
    6: [24, 34, 35, 42, 52, 55, 60, 62, 67, 72],
    7: [14, 39, 85, 88, 80, 81, 87],
    8: [31, 44, 73, 76, 77, 82, 86, 90],
    9: [8, 58, 75, 78, 89],
}

# Invert the groups into an {original id: merged id} lookup. Fail loudly if an
# original id is ever listed under two groups instead of silently letting the
# later entry win.
topic_merge_map = {}
for merged_id, original_ids in merged_topic_members.items():
    for original_id in original_ids:
        if original_id in topic_merge_map:
            raise ValueError(
                f"Topic {original_id} is assigned to both group "
                f"{topic_merge_map[original_id]} and group {merged_id}"
            )
        topic_merge_map[original_id] = merged_id

# Work on a copy so the original per-article assignments in `data` stay intact.
data_updated = data.copy()
data_updated['Topic'] = data_updated['Topic'].replace(topic_merge_map)
data_updated
| Headline | Publication | URL | News Outlet | Type of News | Word Count | Body | Publication Date | Combined | Topic | Probabilities | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Nee, kunstmatige intelligentie gaat ons niet u... | Trouw, Verdieping; Blz. 4, 5, 2044 words | https://advance.lexis.com/api/document?collect... | Trouw | Verdieping | 2044 | Welkom in de AI-fabriek serie\nDat kunstmatige... | 07-12-2023 | nee kunstmatige intelligentie gaat uitroeien w... | 0 | 0.716202 |
| 1 | Wereldleiders zoeken grip op kunstmatige intel... | Trouw, Vandaag; Blz. 6, 528 words | https://advance.lexis.com/api/document?collect... | Trouw | Vandaag | 528 | Op het Britse landgoed Bletchley Park werden t... | 03-11-2023 | wereldleiders zoeken grip kunstmatige intellig... | 1 | 1.000000 |
| 2 | Kunstmatige intelligentie is best bedreigend | Trouw, Tijdgeest; Blz. 8, 576 words | https://advance.lexis.com/api/document?collect... | Trouw | Tijdgeest | 576 | Of kunstmatige intelligentie nuttig is (Tijdge... | 13-05-2023 | kunstmatige intelligentie best bedreigend kuns... | -1 | 0.000000 |
| 3 | Mensen zijn een stuk efficiënter dan kunstmati... | Trouw, Vandaag; Blz. 3, 741 words | https://advance.lexis.com/api/document?collect... | Trouw | Vandaag | 741 | De wereld raakte het afgelopen jaar in de ban ... | 21-10-2023 | mensen stuk efficiënter kunstmatige intelligen... | -1 | 0.000000 |
| 4 | Bedreigt kunstmatige intelligentie ons godsbeeld? | Trouw, Religie en Filosofie; Blz. 8, 9, 1367 w... | https://advance.lexis.com/api/document?collect... | Trouw | Religie en Filosofie | 1367 | Theologisch elftal\n'In het begin was het Woor... | 16-12-2022 | bedreigt kunstmatige intelligentie godsbeeld t... | 8 | 1.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6441 | De rauwe realiteit | Het Financieele Dagblad, MORGEN; Blz. 4, 2920 ... | https://advance.lexis.com/api/document?collect... | Het Financieele Dagblad | MORGEN | 2920 | Grootse oplossingen\nDrie stedelijke 'ontwrich... | 14-10-2017 | rauwe realiteit grootse oplossingen drie stede... | -1 | 0.000000 |
| 6442 | No Headline In Original | Het Financieele Dagblad, PAGINA 13; Blz. 13, 1... | https://advance.lexis.com/api/document?collect... | Het Financieele Dagblad | PAGINA | 114 | klinkt als muziek\nDe Walkman, van Sony, is vo... | 29-04-2023 | no headline original klinkt muziek walkman son... | -1 | 0.000000 |
| 6443 | Groeten uit het hart van de hightech | Het Financieele Dagblad, WEEKEND; Blz. 6, 2799... | https://advance.lexis.com/api/document?collect... | Het Financieele Dagblad | WEEKEND | 2799 | Het is zover voor 'onze man in San Francisco'.... | 20-08-2016 | groeten hart hightech zover man san francisco ... | -1 | 0.000000 |
| 6444 | De complete lijst Jonge Talenten 2019 | Het Financieele Dagblad, FD PERSOONLIJK; Arbei... | https://advance.lexis.com/api/document?collect... | Het Financieele Dagblad | FD PERSOONLIJK; Arbeidsmarkt | 8007 | Rebel werkte zes jaar bij zakenbank Morgan Sta... | 17-01-2019 | complete lijst jonge talenten rebel werkte z j... | 1 | 0.880587 |
| 6445 | No Headline In Original | Het Financieele Dagblad, DE WERELD; Blz. 30, 9... | https://advance.lexis.com/api/document?collect... | Het Financieele Dagblad | DE WERELD | 969 | The Conversation (Londen)Gates Notes (VS)The E... | 08-12-2018 | no headline original the conversation londen g... | -1 | 0.000000 |
6446 rows × 11 columns
# Human-readable labels for the merged topic ids, listed in id order (-1 to 9)
topic_labels_in_order = [
    'AI in Society',  # -1
    'Technology',     # 0
    'Politics',       # 1
    'Business',       # 2
    'Economy',        # 3
    'Healthcare',     # 4
    'Art',            # 5
    'Law',            # 6
    'Media',          # 7
    'Education',      # 8
    'Environment',    # 9
]
topic_names = dict(zip(range(-1, 10), topic_labels_in_order))

# Look up the label for each article's topic id and store it as 'Topic Name'
topic_name_column = data_updated['Topic'].map(topic_names)
data_updated['Topic Name'] = topic_name_column
data_updated
| Headline | Publication | URL | News Outlet | Type of News | Word Count | Body | Publication Date | Combined | Topic | Probabilities | Topic Name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Nee, kunstmatige intelligentie gaat ons niet u... | Trouw, Verdieping; Blz. 4, 5, 2044 words | https://advance.lexis.com/api/document?collect... | Trouw | Verdieping | 2044 | Welkom in de AI-fabriek serie\nDat kunstmatige... | 07-12-2023 | nee kunstmatige intelligentie gaat uitroeien w... | 0 | 0.716202 | Technology |
| 1 | Wereldleiders zoeken grip op kunstmatige intel... | Trouw, Vandaag; Blz. 6, 528 words | https://advance.lexis.com/api/document?collect... | Trouw | Vandaag | 528 | Op het Britse landgoed Bletchley Park werden t... | 03-11-2023 | wereldleiders zoeken grip kunstmatige intellig... | 1 | 1.000000 | Politics |
| 2 | Kunstmatige intelligentie is best bedreigend | Trouw, Tijdgeest; Blz. 8, 576 words | https://advance.lexis.com/api/document?collect... | Trouw | Tijdgeest | 576 | Of kunstmatige intelligentie nuttig is (Tijdge... | 13-05-2023 | kunstmatige intelligentie best bedreigend kuns... | -1 | 0.000000 | AI in Society |
| 3 | Mensen zijn een stuk efficiënter dan kunstmati... | Trouw, Vandaag; Blz. 3, 741 words | https://advance.lexis.com/api/document?collect... | Trouw | Vandaag | 741 | De wereld raakte het afgelopen jaar in de ban ... | 21-10-2023 | mensen stuk efficiënter kunstmatige intelligen... | -1 | 0.000000 | AI in Society |
| 4 | Bedreigt kunstmatige intelligentie ons godsbeeld? | Trouw, Religie en Filosofie; Blz. 8, 9, 1367 w... | https://advance.lexis.com/api/document?collect... | Trouw | Religie en Filosofie | 1367 | Theologisch elftal\n'In het begin was het Woor... | 16-12-2022 | bedreigt kunstmatige intelligentie godsbeeld t... | 8 | 1.000000 | Education |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6441 | De rauwe realiteit | Het Financieele Dagblad, MORGEN; Blz. 4, 2920 ... | https://advance.lexis.com/api/document?collect... | Het Financieele Dagblad | MORGEN | 2920 | Grootse oplossingen\nDrie stedelijke 'ontwrich... | 14-10-2017 | rauwe realiteit grootse oplossingen drie stede... | -1 | 0.000000 | AI in Society |
| 6442 | No Headline In Original | Het Financieele Dagblad, PAGINA 13; Blz. 13, 1... | https://advance.lexis.com/api/document?collect... | Het Financieele Dagblad | PAGINA | 114 | klinkt als muziek\nDe Walkman, van Sony, is vo... | 29-04-2023 | no headline original klinkt muziek walkman son... | -1 | 0.000000 | AI in Society |
| 6443 | Groeten uit het hart van de hightech | Het Financieele Dagblad, WEEKEND; Blz. 6, 2799... | https://advance.lexis.com/api/document?collect... | Het Financieele Dagblad | WEEKEND | 2799 | Het is zover voor 'onze man in San Francisco'.... | 20-08-2016 | groeten hart hightech zover man san francisco ... | -1 | 0.000000 | AI in Society |
| 6444 | De complete lijst Jonge Talenten 2019 | Het Financieele Dagblad, FD PERSOONLIJK; Arbei... | https://advance.lexis.com/api/document?collect... | Het Financieele Dagblad | FD PERSOONLIJK; Arbeidsmarkt | 8007 | Rebel werkte zes jaar bij zakenbank Morgan Sta... | 17-01-2019 | complete lijst jonge talenten rebel werkte z j... | 1 | 0.880587 | Politics |
| 6445 | No Headline In Original | Het Financieele Dagblad, DE WERELD; Blz. 30, 9... | https://advance.lexis.com/api/document?collect... | Het Financieele Dagblad | DE WERELD | 969 | The Conversation (Londen)Gates Notes (VS)The E... | 08-12-2018 | no headline original the conversation londen g... | -1 | 0.000000 | AI in Society |
6446 rows × 12 columns
# Check whether all topics have been properly relabelled by listing the
# distinct topic ids that remain in the updated DataFrame
topics = data_updated.loc[:, 'Topic'].unique()
print(topics)
[ 0 1 -1 8 9 6 4 5 2 7 3]
# Count how many articles carry each topic id
topic_frequency = data_updated['Topic'].value_counts()

# Print one "Topic / Frequency" pair per topic, each followed by a blank line
for topic, frequency in zip(topic_frequency.index, topic_frequency.tolist()):
    print("Topic: ", topic)
    print("Frequency: ", frequency, end="\n\n")
Topic: -1 Frequency: 2498 Topic: 0 Frequency: 821 Topic: 1 Frequency: 595 Topic: 2 Frequency: 594 Topic: 3 Frequency: 476 Topic: 4 Frequency: 406 Topic: 6 Frequency: 279 Topic: 7 Frequency: 251 Topic: 5 Frequency: 231 Topic: 8 Frequency: 151 Topic: 9 Frequency: 144
# Wedge labels (topic ids) and wedge sizes (article counts), in the order of topic_frequency
labels = topic_frequency.index
counts = topic_frequency.values

# Colours chosen to match the specific colours used in the line graph
color_palette = [
    'brown', 'darkorange', 'green', 'red', 'purple', 'royalblue',
    'yellow', 'darkcyan', 'slategray', 'lightseagreen', 'hotpink',
]

# Draw the pie chart on an explicit 8x8 inch figure, showing each share as a percentage
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(counts, labels=labels, autopct='%1.1f%%', colors=color_palette)
ax.set_title('Topic Frequency')

# Display the pie chart
plt.show()
# Break the topics down per source, using each article's news outlet as its class
news_outlets = data_updated['News Outlet']
topics_per_source = model.topics_per_class(content_titles, classes=news_outlets)

# Visualize the per-source breakdown (kept as the final expression so the notebook displays it)
model.visualize_topics_per_class(topics_per_source)